{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook presents the pre-processing of the poloniex (crypto-currencies) data used to produce the input tensors of the neural network. <br>\n",
    "For each stock, the input is a raw time series of the prices (High, Low, Open, Close). <i>Please note for crypto-currencies, the market never closes, so Close(t) = Open(t+1). </i><br>\n",
    "The output is a matrix of 3 rows and n (number of available data points) columns. <br>\n",
    "The columns correspond to:\n",
    "- High(t-1)/Open(t-1)\n",
    "- Low(t-1)/Open(t-1)\n",
    "- Open(t)/Open(t-1)\n",
    "    \n",
    "<u>Remark:</u> We don't need to normalize the data since it's already of ratio of 2 prices closed to one. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-04T22:02:30.384677Z",
     "start_time": "2018-07-04T22:02:28.286266Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.figure.Figure at 0x10f7046d8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#for navigation in the folders\n",
    "import os\n",
    "import pathlib\n",
    "\n",
    "from time import strptime\n",
    "from datetime import datetime\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "#for plot\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.image as mpimg\n",
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns \n",
    "import PIL\n",
    "import pickle\n",
    "from time import strftime\n",
    "\n",
    "\n",
    "\n",
    "import seaborn as sns\n",
    "sns.despine()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-04-28T22:11:59.059202Z",
     "start_time": "2018-04-28T22:11:59.056226Z"
    }
   },
   "source": [
    "# Processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T18:30:07.828096Z",
     "start_time": "2018-05-10T18:30:07.817675Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "There are 14 different currencies.\n"
     ]
    }
   ],
   "source": [
    "data_dir = '/poloniex_data/'\n",
    "directory = os.getcwd() + data_dir # path to the files\n",
    "files_tags = os.listdir(directory) #these are the differents pdf files\n",
    "\n",
    "#this is here because hidden files are also shown in the list. \n",
    "for file in files_tags:\n",
    "    if file[0] == '.':\n",
    "        files_tags.remove(file)\n",
    "stock_name = [file.split('.')[0] for file in files_tags]\n",
    "stocks = [file for file in files_tags]\n",
    "print(len(stock_name) == len(stocks))\n",
    "print('There are {} different currencies.'.format(len(stock_name)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T18:31:40.244710Z",
     "start_time": "2018-05-10T18:31:39.382997Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ETCBTC.csv 17032\n",
      "GNTETH.csv 7001\n",
      "REPETH.csv 13547\n",
      "ETHBTC.csv 33878\n",
      "DOGEBTC.csv 60915\n",
      "ETHUSDT.csv 33876\n",
      "BTCUSDT.csv 42010\n",
      "XRPBTC.csv 51113\n",
      "DASHBTC.csv 60103\n",
      "REPBTC.csv 13547\n",
      "XMRBTC.csv 55285\n",
      "LTCBTC.csv 61096\n",
      "GNTBTC.csv 7001\n",
      "ETCETH.csv 17031\n"
     ]
    }
   ],
   "source": [
    "for s in stocks:\n",
    "    df = pd.read_csv('.'+data_dir+s)\n",
    "    print(s, len(df))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We want roughly 1 year of data. So, we drop the data with less than 17000 rows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T18:46:22.791273Z",
     "start_time": "2018-05-10T18:46:21.997827Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17031"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kept_stocks = ['ETCBTC.csv', 'ETHBTC.csv', 'DOGEBTC.csv', 'ETHUSDT.csv', 'BTCUSDT.csv', \n",
    "              'XRPBTC.csv', 'DASHBTC.csv', 'XMRBTC.csv', 'LTCBTC.csv', 'ETCETH.csv']\n",
    "len_stocks = list()\n",
    "\n",
    "for s in kept_stocks:\n",
    "    df = pd.read_csv('.'+data_dir+s)\n",
    "    len_stocks.append(len(df))\n",
    "\n",
    "    \n",
    "min_len = np.min(len_stocks)\n",
    "min_len"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T18:53:41.426068Z",
     "start_time": "2018-05-10T18:53:40.534828Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 11.57it/s]\n"
     ]
    }
   ],
   "source": [
    "list_open = list()\n",
    "list_close = list()\n",
    "list_high = list()\n",
    "list_low = list()\n",
    "\n",
    "for s in tqdm(kept_stocks):\n",
    "    data = pd.read_csv(os.getcwd() + data_dir + s).fillna('bfill').copy()\n",
    "    data = data[['open', 'close', 'high', 'low']]\n",
    "    data = data.tail(min_len)\n",
    "    list_open.append(data.open.values)\n",
    "    list_close.append(data.close.values)\n",
    "    list_high.append(data.high.values)\n",
    "    list_low.append(data.low.values)\n",
    "\n",
    "array_open = np.transpose(np.array(list_open))[:-1]\n",
    "array_open_of_the_day = np.transpose(np.array(list_open))[1:]\n",
    "array_close = np.transpose(np.array(list_close))[:-1]\n",
    "array_high = np.transpose(np.array(list_high))[:-1]\n",
    "array_low = np.transpose(np.array(list_low))[:-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T18:56:40.155995Z",
     "start_time": "2018-05-10T18:56:40.147580Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17031, 10)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.transpose(np.array(list_low)).shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T18:59:33.142552Z",
     "start_time": "2018-05-10T18:59:33.123914Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3, 10, 17030)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = np.transpose(np.array([\n",
    "    array_high/array_open,\n",
    "    array_low/array_open,\n",
    "    array_open_of_the_day/array_open]), axes=(0, 2, 1))\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The shape corresponds to:\n",
    "- 3: Number of features\n",
    "- 10: Number of stocks\n",
    "- 17030: Number of data points"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-05-10T19:00:22.927409Z",
     "start_time": "2018-05-10T19:00:22.885613Z"
    }
   },
   "outputs": [],
   "source": [
    "np.save('./np_data/inputCrypto.npy', X)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
